#-------------------------------------------------------------------------------
# ANALYSIS OF T CELL POLYFUNCTIONALITY DATA
#-------------------------------------------------------------------------------
library(tidyverse)
library(gamlss)
library(emmeans)
library(RColorBrewer)
library(cowplot)
library(writexl)
library(DT)
map <- purrr::map
# functions to run EM for choosing threshold
source("scripts/utils/normal_beta_em_utils.R")
source("scripts/utils/utils.R")
modernacolors()set.seed(43210)
d <- readxl::read_xlsx("processed_data/polyfunctional_tcell.xlsx") %>%
dplyr::filter(!str_detect(Stimulation, "PMA")) %>%
dplyr::filter(!is.na(dose) & dose > 0)As described in the Statistical Methods materials, we first need to threshold the polyfunctional T cell data, in order to capture data from the null (i.e., truly zero) and non-null (i.e., some positive proportion of polyfunctional cells in the sample) populations. This is done using a normal-beta mixture model, where the normal distribution, centered at 0, captures the null component, and the beta distribution captures the non-null component.
Note that we are reporting “Aggregate composition.” This means, among the 3 cytokines we are studying, we are agnostic to which ones are positively expressed. We are only looking for the number of positively expressed cytokines. So, we sum across compositions that are single producers, and sum across compositions that are dual producers, to generate an “aggregate composition” of single and dual producers.
The below script estimates these models within each relevant
Stimulations/Cell population group. The resulting parameter estimates
are displayed. prop0 is the mixing proportion of the null
population. mu and sigma are the mean and
standard deviation of the normal component. shape1 and
shape2 and the shape parameters of the beta
distribution.
#-------------------------------------------------------------------------------
# Estimate a distribution for background and signal data using a normal-beta
# mixture model. This is motivated by the SPICE paper. Estimation is done for
# cell type/day specific data. Do estimation using the EM algorithm.
#-------------------------------------------------------------------------------
spl <- group_split(d, cell_population, day) %>%
purrr::map(~ {
if(.x$cell_population[1] == "CD4 T") {
dplyr::filter(.x, Stimulation == "S2")
} else if(.x$cell_population[1] == "CD8 T") {
dplyr::filter(.x, Stimulation == "S1")
}
})
names(spl) <-
unlist(group_keys(group_by(d, cell_population, day)) %>% unite(col = nm))
em_fits <-
purrr::map(spl,
~ fit_beta_normal_mixture(.x$agg_composition, mu_equal_zero = TRUE))
purrr::imap_dfr(em_fits,
~ as_tibble(.x$parameters) %>% mutate(nm = .y, .before = 1)) %>%
separate(nm, c("Cell population", "Stimulation"), sep = "_") %>%
knitr::kable()| Cell population | Stimulation | prop0 | mu | sigma | shape1 | shape2 |
|---|---|---|---|---|---|---|
| CD4 T | 64 | 0.0461383 | 0 | 0.0005177 | 2.5741847 | 2736.2961 |
| CD4 T | 87 | 0.0575217 | 0 | 0.0010990 | 1.6881778 | 1791.6222 |
| CD4 T | 143 | 0.1394895 | 0 | 0.0017161 | 3.4336328 | 4276.7515 |
| CD4 T | 226 | 0.1018359 | 0 | 0.0009028 | 2.5371268 | 6270.7086 |
| CD8 T | 64 | 0.0432076 | 0 | 0.0002053 | 0.5476772 | 106.3619 |
| CD8 T | 87 | 0.0671753 | 0 | 0.0018250 | 0.6725799 | 161.8807 |
| CD8 T | 143 | 0.0115484 | 0 | 0.0024120 | 1.1334973 | 186.8061 |
| CD8 T | 226 | 0.0000000 | 0 | 0.0010402 | 0.9915519 | 255.5449 |
We define the threshold as the point at which observations have over a 95% chance of belonging to the non-null component, based on the model fit. The below plots visualize the estimated threshold (purple dashed line), as well as the histogram of fitted and observed models.
#-------------------------------------------------------------------------------
# Plot estimated mixture distribution vs. the real data
# Calculate model-driven threshold in the process
# Keep as non-zero the observations which have >= 95% chance of belonging to
# the signal distribution. This is motivated as the appropriate threshold for
# smaller datasets in the SPICE paper.
#-------------------------------------------------------------------------------
mixture_density_plots <- list()
for (i in seq_along(spl)) {
sim <-
sample(
0:1,
replace = TRUE,
size = 10000,
prob = c(em_fits[[i]]$parameters$prop0, 1 - em_fits[[i]]$parameters$prop0)
) %>%
tibble(sim_dat = c(
rbeta(
n = sum(. == 1),
shape1 = em_fits[[i]]$parameters$shape1,
shape2 = em_fits[[i]]$parameters$shape2
),
rnorm(
n = sum(. == 0),
mean = em_fits[[i]]$parameters$mu,
sd = em_fits[[i]]$parameters$sigma
)
),
group = c(rep("non-zero", sum(. == 1)), rep("zero", sum(. == 0)))) %>%
dplyr::select(-1)
thresh <-
dplyr::filter(spl[[i]], em_fits[[i]]$z[, "non-zero"] >= .95) %>%
pull(agg_composition) %>%
min()
spl[[i]] <- mutate(spl[[i]], thresh = thresh)
mixture_density_plots[[i]] <-
ggplot(spl[[i]], aes(x = agg_composition, y = ..density..)) +
geom_histogram(
color = "grey40",
fill = "white",
bins = 20,
size = 1.1
) +
geom_density(
data = filter(sim, group == "zero"),
aes(
x = sim_dat,
y = ..density.. * em_fits[[i]]$parameters$prop0,
color = group,
fill = group,
group = group
),
alpha = 0.3,
size = 1.1
) +
geom_density(
data = filter(sim, group == "non-zero"),
aes(
x = sim_dat,
y = ..density.. * (1 - em_fits[[i]]$parameters$prop0),
color = group,
fill = group,
group = group
),
alpha = 0.3,
size = 1.1
) +
scale_fill_manual(values = c(modernalightgray, modernared)) +
scale_color_manual(values = c(modernalightgray, modernared)) +
geom_vline(
aes(xintercept = thresh),
color = modernapurple,
size = 1.2,
linetype = "twodash"
) +
labs(
y = "",
x = "Aggregate composition",
fill = "Estimated\ndistribution",
color = "Estimated\ndistribution"
) +
ggtitle(paste0(
spl[[i]]$cell_population[1],
" cells, Day ",
spl[[i]]$day[1],
", ",
spl[[i]]$Stimulation[1]
)) +
theme_minimal()
}
leg <- cowplot::get_legend(mixture_density_plots[[1]])
mixture_density_plots <-
map(mixture_density_plots, ~ .x + theme(legend.position = "none"))
pgrid <-
cowplot::plot_grid(plotlist = mixture_density_plots,
nrow = 2,
ncol = 4)
pgrid_out <-
cowplot::plot_grid(
pgrid,
leg,
nrow = 1,
ncol = 2,
rel_widths = c(10, 1)
)
print(pgrid_out)The estimated thresholds for each cell population/stimulation combination are displayed below.
#-------------------------------------------------------------------------------
# Table of thresholds
#-------------------------------------------------------------------------------
# Extract mixture model thresholds
threshd_mix <- bind_rows(spl) %>%
mutate(
thresh_agg_composition = case_when(
agg_composition < thresh ~ 0,
agg_composition >= thresh ~ agg_composition
)
)
thresh_table <-
dplyr::select(threshd_mix, cell_population, Stimulation, day, thresh) %>%
arrange(cell_population, Stimulation, day) %>%
distinct() %>%
setNames(c("Cell population", "Stimulation", "Day", "threshold"))
knitr::kable(thresh_table)| Cell population | Stimulation | Day | threshold |
|---|---|---|---|
| CD4 T | S2 | 64 | 0.0002900 |
| CD4 T | S2 | 87 | 0.0000942 |
| CD4 T | S2 | 143 | 0.0003194 |
| CD4 T | S2 | 226 | 0.0000858 |
| CD8 T | S1 | 64 | 0.0004000 |
| CD8 T | S1 | 87 | 0.0000000 |
| CD8 T | S1 | 143 | 0.0001900 |
| CD8 T | S1 | 226 | 0.0000703 |
#-------------------------------------------------------------------------------
# Boxplots of aggregate composition across weeks post dose, by day
#-------------------------------------------------------------------------------
p_boxplot2 <- group_split(threshd_mix, cell_population) %>%
purrr::map(
~ ggplot(.x, aes(x = weeks_post_dose, y = thresh_agg_composition, group = interaction(factor(num_positive), weeks_post_dose))) +
geom_boxplot(aes(color = factor(num_positive)), outlier.shape = NA) +
geom_jitter(aes(color = factor(num_positive)), position = position_jitterdodge(), pch = 21) +
scale_color_manual(values = c(modernaorange, modernablue, modernapurple, modernagreen)) +
scale_y_continuous(trans = "sqrt") +
facet_grid(day ~ Stimulation,
labeller = labeller("day" = function(.x) paste0("Day ", .x))) +
labs(x = "Weeks post dose", y = "Aggregate composition", title = paste0(.x$cell_population[1], " cells"), color = "# positive") +
theme_bw()
)
walk(p_boxplot2, print)#-------------------------------------------------------------------------------
# Stacked bar charts of average (across mice within group) aggregate composition
# across weeks post dose
#-------------------------------------------------------------------------------
barp <- threshd_mix %>%
group_split(cell_population, group, weeks_post_dose, day, Stimulation, num_positive) %>%
map_dfr(~ mutate(.x, agg_composition=mean(agg_composition), mouse_id = NULL) %>% distinct()) %>%
group_split(cell_population) %>%
purrr::map(
~ ggplot(.x, aes(x = weeks_post_dose, y = thresh_agg_composition, group = interaction(factor(num_positive), weeks_post_dose))) +
geom_bar(aes(fill = factor(num_positive)), position = "stack", stat = "identity") +
scale_fill_manual(values = brewer.pal(n = 9, "BuGn")[c(4,6,9)]) +
facet_grid(day ~ Stimulation,
labeller = labeller("num_positive" = function(.x) paste0(.x, " positive"), "day" = function(.x) paste0("Day ", .x))) +
labs(x = "Weeks post dose", y = "Mean aggregate composition", title = paste0(.x$cell_population[1], " cells"), fill = "# positive") +
theme_bw()
)
walk(barp, print)We use a zero-inflated beta regression model to model the proportion of single, dual, and triple cytokine expressors. We fit models for CD4+ and CD8+ T cells, and for single, dual, and triple expressors separately, resulting in 6 total models.
#-------------------------------------------------------------------------------
# Multivariate mixed modeling
#-------------------------------------------------------------------------------
d4 <- filter(threshd_mix, cell_population == "CD4 T") %>%
mutate(agg_composition = NULL) %>%
pivot_wider(id_cols = everything(), names_from = "num_positive", values_from = "thresh_agg_composition") %>%
rename("one" = "1", "two" = "2", "three" = "3")
d8 <- filter(threshd_mix, cell_population == "CD8 T") %>%
mutate(agg_composition = NULL) %>%
pivot_wider(id_cols = everything(), names_from = "num_positive", values_from = "thresh_agg_composition") %>%
rename("one" = "1", "two" = "2", "three" = "3")
f4 <- purrr::map(c("one", "two", "three"), ~ {
if(.x == "one") {
form <- as.formula(paste0(.x, " ~ factor(day) * factor(weeks_post_dose)"))
} else {
form <- as.formula(paste0(.x, " ~ factor(day) + factor(weeks_post_dose)"))
}
sigma_form <- as.formula("~ factor(weeks_post_dose)")
gamlss::gamlss(
form,
sigma.formula = sigma_form,
family = BEZI,
data = d4,
trace = FALSE,
control = gamlss.control(n.cyc = 80, trace = FALSE),
i.control = glim.control(glm.trace = FALSE, bf.trace = FALSE)
)
}) %>%
setNames(c("one", "two", "three"))
f8 <- purrr::map(c("one", "two", "three"), ~ {
if(.x == "one") {
form <- as.formula(paste0(.x, " ~ factor(day) * factor(weeks_post_dose)"))
} else {
form <- as.formula(paste0(.x, " ~ factor(day) + factor(weeks_post_dose)"))
}
sigma_form <- as.formula("~ factor(weeks_post_dose)")
gamlss::gamlss(
form,
sigma.formula = sigma_form,
family = BEZI,
data = d8,
trace = FALSE,
control = gamlss.control(n.cyc = 80, trace = FALSE),
i.control = glim.control(glm.trace = FALSE, bf.trace = FALSE)
)
}) %>%
setNames(c("one", "two", "three"))
ds <- ls(pattern = "d[[:digit:]]") %>% lapply(function(x) eval(as.name(x)))
names(ds) <- ls(pattern = "d[[:digit:]]")
fits <- ls(pattern = "f[[:digit:]]") %>% lapply(function(x) eval(as.name(x)))
names(fits) <- ls(pattern = "f[[:digit:]]")Since the data are modeled using a generalized linear model assuming a distribution that contains both discrete and continuous components, normalized randomized quantile residuals (RQRs) are used in residual diagnostic plots.
par(mfrow = c(3,2))
iwalk(f4, ~ {
plot(fitted(.x), residuals(.x), xlab = "Fitted values", ylab = "RQRs", main = paste0("Residual plots for CD4+ T cells with ", .y, "\ncytokine(s) expressed"))
abline(h = 0, col = "tomato2", lty = 2)
qqnorm(residuals(.x));
qqline(residuals(.x))
})par(mfrow = c(3,2))
iwalk(f8, ~ {
plot(fitted(.x), residuals(.x), xlab = "Fitted values", ylab = "RQRs", main = paste0("Residual plots for CD8+ T cells with ", .y, "\ncytokine(s) expressed"))
abline(h = 0, col = "tomato2", lty = 2)
qqnorm(residuals(.x));
qqline(residuals(.x))
})#-------------------------------------------------------------------------------
# Differences in consecutive dosing intervals
#-------------------------------------------------------------------------------
em_fits1 <- imap(fits, ~ {
lapply(.x, function(X) {
emmeans::emmeans(
ref_grid(X),
specs = ~ weeks_post_dose | day
) %>%
contrast(method = "revpairwise", type = "response") %>%
confint(level = 0.95, adjust = "mvt")
})
})
names(em_fits1) <- names(fits)
contrast_plots1 <- imap(em_fits1, ~ {
titles <- names(.x)
out <- list()
for(i in seq_along(titles)) {
out[[i]] <- plot(.x[[i]]) +
geom_vline(aes(xintercept = 1), linetype = "twodash", color = "red") +
facet_grid(~ day, labeller = labeller(day = function(.y) paste0("Day ", .y))) +
labs(x = "Odds ratio", y = "Contrast in dosing interval") +
ggtitle(paste0("CD", str_extract(.y, "[[:digit:]]"), " T cells, ", titles[i], " function(s)")) +
theme_bw()
}
out
})
walk(contrast_plots1, print)## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[1]]
##
## [[2]]
##
## [[3]]
#-------------------------------------------------------------------------------
# Tables of P-values to go with the figure
#-------------------------------------------------------------------------------
em_tables <- imap(fits, ~ {
lapply(.x, function(X) {
emmeans::emmeans(
ref_grid(X),
specs = ~ weeks_post_dose | day
) %>%
contrast(method = "revpairwise", type = "response", adjust = "mvt")
})
})
pvals <- imap_dfr(em_tables, ~ {
imap_dfr(.x, function(.xx, .yy) {
as_tibble(.xx) %>%
mutate(num_cytokines = .yy)
}) %>%
mutate(tcell = .y)
}) %>%
dplyr::select(contrast, odds.ratio, day, tcell, num_cytokines, p.value) %>%
rename(
"Pairwise comparison" = "contrast",
"Odds ratio" = "odds.ratio",
"Day" = "day",
"Cell" = "tcell",
"# positive" = "num_cytokines",
"Adjusted P-value" = p.value
) %>%
mutate(
Cell = case_when(
Cell == "f4" ~ "CD4+ T",
Cell == "f8" ~ "CD8+ T"
),
`# positive` = case_when(
`# positive` == "one" ~ 1,
`# positive` == "two" ~ 2,
`# positive` == "three" ~ 3
)) %>%
filter(`Adjusted P-value` <= 0.05)
DT::datatable(pvals)sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin19.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /usr/local/Cellar/openblas/0.3.19/lib/libopenblasp-r0.3.19.dylib
## LAPACK: /usr/local/Cellar/r/4.1.2/lib/R/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel splines stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] mclust_5.4.9 magrittr_2.0.2 DT_0.21 writexl_1.4.0
## [5] cowplot_1.1.1 RColorBrewer_1.1-2 emmeans_1.7.4-1 gamlss_5.4-1
## [9] nlme_3.1-155 gamlss.dist_6.0-3 MASS_7.3-55 gamlss.data_6.0-2
## [13] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.9 purrr_0.3.4
## [17] readr_2.1.2 tidyr_1.2.0 tibble_3.1.7 ggplot2_3.3.6
## [21] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] fs_1.5.2 bit64_4.0.5 lubridate_1.8.0 httr_1.4.2
## [5] tools_4.1.2 backports_1.4.1 bslib_0.3.1 utf8_1.2.2
## [9] R6_2.5.1 DBI_1.1.2 colorspace_2.0-3 withr_2.5.0
## [13] tidyselect_1.1.2 bit_4.0.4 compiler_4.1.2 textshaping_0.3.6
## [17] cli_3.3.0 rvest_1.0.2 xml2_1.3.3 sandwich_3.0-1
## [21] labeling_0.4.2 sass_0.4.0 scales_1.2.0 mvtnorm_1.1-3
## [25] systemfonts_1.0.4 digest_0.6.29 rmarkdown_2.13 pkgconfig_2.0.3
## [29] htmltools_0.5.2 dbplyr_2.1.1 fastmap_1.1.0 highr_0.9
## [33] htmlwidgets_1.5.4 rlang_1.0.2 readxl_1.3.1 rstudioapi_0.13
## [37] farver_2.1.0 jquerylib_0.1.4 generics_0.1.2 zoo_1.8-9
## [41] jsonlite_1.8.0 crosstalk_1.2.0 vroom_1.5.7 Matrix_1.4-0
## [45] Rcpp_1.0.8.3 munsell_0.5.0 fansi_1.0.3 lifecycle_1.0.1
## [49] stringi_1.7.3 multcomp_1.4-18 yaml_2.3.5 grid_4.1.2
## [53] crayon_1.5.1 lattice_0.20-45 haven_2.4.3 hms_1.1.1
## [57] knitr_1.37 pillar_1.7.0 estimability_1.3 codetools_0.2-18
## [61] reprex_2.0.1 glue_1.6.2 evaluate_0.15 modelr_0.1.8
## [65] vctrs_0.4.1 tzdb_0.2.0 cellranger_1.1.0 gtable_0.3.0
## [69] assertthat_0.2.1 xfun_0.30 xtable_1.8-4 broom_0.7.12
## [73] coda_0.19-4 ragg_1.2.1 survival_3.2-13 TH.data_1.1-0
## [77] ellipsis_0.3.2